#' reading in:
#'   manifesto_clean.csv  (written out from 1-manifesto-cleaning.R)
#' creating new vars:
#'   hawk, hawk_reputation, above_median_hawk, below_median_hawk
#' creating a complete party-year panel
#'   copying over hawk variables from party-election data
#'   filling forward up to ten years from the election when the hawk values were coded
#' writing out (in process_data folder):
#'   manifesto_with_hawkishness.csv
#'   manifesto_party_year.csv 


# load the cleaned party-election manifesto data
m = read_csv("process_data/manifesto_clean.csv")

# lookup table mapping the numeric party-family code (parfam) to a short
#  label; kept for reference only (code 999 has no label)
parfam_descs = tibble(
  parfam      = c(10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98, 999),
  parfam_desc = c('ECO', 'LEF', 'SOC', 'LIB', 'CHR', 'CON',
                  'NAT', 'AGR', 'ETH', 'SIP', 'DIV', NA_character_)
)

# attach the labels to every party-election row (joins on shared columns)
m = left_join(m, parfam_descs)

# frequency of each party-family label
table(select(m, parfam_desc))
# AGR CHR CON DIV ECO ETH LEF LIB NAT SIP SOC 
# 182 519 738  16 190 398 579 738 389 159 993



#### CREATING HAWK AND HAWK_REPUTATION VARS ####

#' per101: Foreign Special Relationships: Positive
#' per104: Military: Positive
#' per105: Military: Negative
#' per106: Peace

# hawk: the two "positive" categories minus the "military negative" and
#  "peace" categories
m = m |> mutate(hawk = per101 + per104 - per105 - per106)


# hawk reputation:
#  for each party-election, the average hawk value for that party
#    over the 5*365 days up to (and including) the current election date.
#  NaN when every hawk value in that window is missing.
# NOTE(review): 5*365 days is slightly less than five calendar years once a
#   leap day falls in the window, so an election held exactly five years
#   earlier is excluded -- confirm this is the intended window.

# collect results in a preallocated double vector and assign the column once
n_obs = nrow(m)
hawk_rep_vals = rep(NA_real_, n_obs)

# iterating over country-party-election obs
#  (seq_len() gives zero iterations for an empty table; 1:0 would give two)
for(i in seq_len(n_obs)){
  # progress indicator every 100 rows
  if(i %% 100 == 0){cat(i, ' ')}
  # store the country name, party name, and election date
  cn = m$country_isocode[i]
  p = m$manifesto_partyid[i]
  cur_elec_date = m$elec_date[i]
  # subset to: this country, this party, and range of elec_dates
  recent_hawk_mean = m |>
    filter(country_isocode == cn &
             manifesto_partyid == p &
             elec_date %in% (as.Date(cur_elec_date)-(5*365):0)
           # sequence of all dates from 5*365 days prior, up to cur_elec_date
    ) |>
    pull(hawk) |>
    mean(na.rm = TRUE)
  hawk_rep_vals[i] = recent_hawk_mean
}

m$hawk_reputation = hawk_rep_vals

cor(m$hawk, m$hawk_reputation, use='complete.obs') # 0.9094629

# rows: hawk missing?  columns: hawk_reputation missing?
table(is.na(m$hawk), 
      is.na(m$hawk_reputation))
#       FALSE TRUE
# FALSE  4833    0
# TRUE     30   46

#### CREATING ABOVE/BELOW_MEDIAN_HAWK VARS ####

# how do missing values in pervote (rows) and presvote (columns) overlap?
with(m, table(is.na(pervote), is.na(presvote)))
#       FALSE TRUE
# FALSE   129 4590
# TRUE    140   50

# agreement between the two measures where both are observed
with(m, cor(pervote, presvote, use = 'pairwise.complete.obs')) # 0.9644578

# voteshare: pervote where available, otherwise presvote
#  (fills the 140 obs with only presvote; the 50 obs missing both stay NA)
m = mutate(
  m,
  voteshare = if_else(is.na(pervote), presvote, pervote)
)


# one entry per country-election identifier
mce_list = unique(m$country_elec)

# median hawk: 
#  for each election, the hawk_reputation value for the party 
#   that was the (voteshare-weighted) median hawk_reputation in this election
# Result: a numeric vector named by country_elec, NA when the election has
#  fewer than two parties with a voteshare or when no party crosses the
#  threshold. vapply() guarantees exactly one number per election, whereas
#  sapply() would silently return a list if any election produced zero or
#  several values.
median_hawk = vapply(mce_list, function(mce){
  # all party-election obs for this country-election, arranged by hawk_reputation
  # NOTE(review): arrange() sorts missing hawk_reputation last and those
  #   parties still count toward the cumulative vote share, so the median
  #   itself can be NA -- confirm this is intended.
  msub = m |>
    filter(country_elec == mce & !is.na(voteshare)) |>
    select(manifesto_partyid, voteshare, hawk_reputation) |>
    arrange(hawk_reputation)
  
  # if not multiple parties for this election, median_hawk is NA
  if(nrow(msub) < 2){return(NA_real_)}
  
  # total vote percentage (might not sum up to 100, if missing smaller parties)
  total_vote = sum(msub$voteshare)
  
  # cumulative vote pct, ordered from least to most hawk_rep
  hawk_cumsum = cumsum(msub$voteshare)
  
  # first party whose cumulative share exceeds (total vote+1)/2
  # NOTE(review): a conventional weighted median would use total_vote/2;
  #   the "+1" is kept as in the original -- confirm it is deliberate.
  # which(...)[1] is NA when no party crosses the threshold, which in turn
  #   yields an NA median instead of a zero-length result.
  median_row = which(hawk_cumsum > (total_vote + 1)/2)[1]
  
  # hawk_rep val for the median party
  as.numeric(msub$hawk_reputation[median_row])
}, numeric(1))


# above_median_hawk: 
#  either your hawk_reputation value is strictly above the 
#    voteshare-weighted median hawk_reputation value for this election, 
#  OR you are both the median and the highest hawk_reputation value 
#      (which of course requires that you have at least 50% of vote share)

# below_median_hawk defined analogously

# Both flags stay NA for:
#   - elections with fewer than two parties that have a voteshare
#   - elections whose median hawk_reputation is unknown (NA)
#   - parties whose own hawk_reputation is missing

m$above_median_hawk = NA_real_
m$below_median_hawk = NA_real_

for(mce in mce_list){
  # median_hawk is named vector of median_hawk_vals for each election
  median_hawk_val = median_hawk[mce] 
  
  # rows for this election with a voteshare, ordered by hawk_rep (increasing)
  mce_rows = m |>
    filter(country_elec == mce & !is.na(voteshare)) |>
    arrange(hawk_reputation)
  
  if(nrow(mce_rows) < 2){next}
  
  # without a known median nothing can be placed above or below it
  if(is.na(median_hawk_val)){next}
  
  # rank only parties with a known hawk_reputation: arrange() sorts NA last,
  #  so taking the tail of the unfiltered list would label a party with
  #  *missing* reputation as the most hawkish one
  ranked_parties = mce_rows |>
    filter(!is.na(hawk_reputation)) |>
    pull(manifesto_partyid)
  most_hawkish = tail(ranked_parties, 1)
  least_hawkish = head(ranked_parties, 1)
  
  m = m |> mutate(
    above_median_hawk = case_when(
      # hawk_rep > median, or this party is the most hawkish: 1
      country_elec == mce & 
        (hawk_reputation > median_hawk_val | manifesto_partyid == most_hawkish) ~ 1,
      # hawk_rep <= median (and not most hawkish): 0
      country_elec == mce & 
        (hawk_reputation <= median_hawk_val) ~ 0,
      # other elections (and missing hawk_rep): keep the current value
      TRUE ~ above_median_hawk
    ),
    
    below_median_hawk = case_when(
      # hawk_rep < median, or this party is the least hawkish: 1
      country_elec == mce & 
        (hawk_reputation < median_hawk_val | manifesto_partyid == least_hawkish) ~ 1,
      # hawk_rep >= median (and not least hawkish): 0
      country_elec == mce & (hawk_reputation >= median_hawk_val) ~ 0,
      # other elections (and missing hawk_rep): keep the current value
      TRUE ~ below_median_hawk
    )
  )
}

m |> select(above_median_hawk, below_median_hawk) |> table()
# (counts below were recorded before parties with missing hawk_reputation
#  were excluded from the most/least-hawkish ranking; re-run to refresh)
#           below_median_hawk
# above_median_hawk    0    1
#                 0  846 2266
#                 1 1738    0


# spot check: print every party in one election (the 123rd in mce_list),
#  least to most hawkish, to eyeball the above/below flags
check_mce = mce_list[123]

m %>%
  select(country_elec, manifesto_partyid, partyname, voteshare,
         hawk, hawk_reputation, above_median_hawk, below_median_hawk,
         parfam_desc) %>%
  filter(country_elec == check_mce) %>%
  arrange(hawk_reputation)


# sanity check against party family labels:
#  share of party-elections flagged above / below the median, per family
m %>%
  group_by(parfam_desc) %>%
  summarise(
    pct_above = mean(above_median_hawk, na.rm = TRUE),
    pct_below = mean(below_median_hawk, na.rm = TRUE),
    n = n()
  ) %>%
  arrange(pct_above)



#### CREATING PARTY-YEAR PANEL ####

# full party x year grid: every manifesto_partyid crossed with 1940-2022
# n= 105,908
mpy = expand.grid(unique(m$manifesto_partyid), 1940:2022) |>
  as_tibble() |>
  rename(manifesto_partyid = Var1, year = Var2) |>
  # party-year key, used below to match against m
  mutate(mpid_yr = paste(manifesto_partyid, year, sep = '_'))

# build the same party-year key on the party-election data
m$mpid_yr = paste(m$manifesto_partyid, m$year, sep = '_')


# checking for duplicated mpid_yr values within m
duplicated_mpid_yrs = m |> 
  filter(duplicated(mpid_yr)) |> pull(mpid_yr) |> unique()

# duplicated mpid_yr value would mean either 
#  (i) multiple parties in a given election were assigned the same manifesto_partyid value, 
#  or (ii) the same party appears in two elections within the same calendar year

# for each duplicated key (after 1960): variance of the two flags across the
#  duplicate rows, number of rows, and number of distinct election dates
m |> filter(mpid_yr %in% duplicated_mpid_yrs & year > 1960) |>
  group_by(mpid_yr) |> 
  summarise(x = var(above_median_hawk), y = var(below_median_hawk), 
            n = n(), n_elec = n_distinct(elec_date)) |> print(n = Inf)
# n=2 and n_elec=2 for all cases, meaning these are all instances of (ii)

# x=0 (y=0) means that this party has the same above (below) median hawk value for both elections 
#   (there are some cases with x>0 and/or y>0)

# dropping the second instance for each duplicate (i.e. dropping the later election)
#  elec_date is an explicit tie-break: sorting on mpid_yr alone keeps
#  whichever duplicate happened to come first in m, which is the earlier
#  election only if m was already in chronological order
#  (assumes elec_date sorts chronologically, i.e. a Date or yyyy-mm-dd
#   string -- TODO confirm)
m = m |> arrange(mpid_yr, elec_date) |> filter(!duplicated(mpid_yr))


# merging in country_isocode to mpy, matching by manifesto_partyid
#  (join key stated explicitly so that a column added to either table later
#   cannot silently become part of the key)
# NOTE(review): a manifesto_partyid mapped to more than one country_isocode
#   would duplicate its party-year rows here -- confirm ids are unique to a
#   country.
mpy = mpy |> left_join(
  m |> select(manifesto_partyid, country_isocode) |> unique(),
  by = 'manifesto_partyid'
)


# merging in hawk vars; only election years find a match, all other
#  party-years get NA
mpy = mpy |> left_join(
  m |>
    select(mpid_yr, # matching by mpid_yr
           above_median_hawk, below_median_hawk, 
           hawk_reputation, hawk
    ),
  by = 'mpid_yr'
)  



# hawk vars exist only for election years; carry each coding forward to the
#  following years, but no further than 10 years past the election

# flag the rows that carry an original (election-year) coding
mpy = mutate(mpy, orig_coding = !is.na(hawk_reputation))

table(select(mpy, orig_coding)) # 4799 T, 101109 F

# ref_year: the election year on coded rows, NA elsewhere (filled below)
mpy = mutate(
  mpy,
  ref_year = case_when(
    orig_coding ~ year,
    TRUE ~ NA_real_
  )
)

# within each party, carry ref_year and the hawk vars down to later years
#  until the next non-missing value
# NOTE(review): ref_year is driven by hawk_reputation alone; a hawk var that
#   is missing in a coded election year is filled from an earlier election
#   while ref_year already points at the later one -- confirm this is
#   intended.
mpy = mpy |>
  group_by(manifesto_partyid) |>
  tidyr::fill(ref_year,
              hawk, hawk_reputation,
              above_median_hawk, below_median_hawk,
              .direction = 'down') |>
  ungroup()

# within_10yr: is this year at most 10 years after ref_year?
#  (NA before a party's first coded election)
# then blank out every hawk var that is not within that window
mpy = mpy |>
  mutate(
    within_10yr = year - ref_year <= 10,
    across(
      c(hawk_reputation, hawk, above_median_hawk, below_median_hawk),
      \(v) case_when(
        within_10yr ~ v,
        TRUE ~ NA_real_
      )
    )
  )

# rows: hawk missing?  columns: hawk_reputation missing?
table(is.na(mpy$hawk), 
      is.na(mpy$hawk_reputation)) 
# 23591 not NA, 82317 NA (full overlap)

# keep only party-years that ended up with a hawk value
mpy = filter(mpy, !is.na(hawk))



#### WRITING OUT ####

# party-election data with the hawk variables added
write_csv(m, 'process_data/manifesto_with_hawkishness.csv')
# party-year panel
write_csv(mpy, 'process_data/manifesto_party_year.csv')


# clear the workspace, keeping only replication_wd
rm(list = setdiff(ls(), 'replication_wd'))
